import pandas as pd
import numpy as np
import warnings

# 'precision' as a bare option alias was removed in pandas 1.4;
# the fully-qualified key works on both old and new pandas versions.
pd.set_option('display.precision', 2)
warnings.filterwarnings('ignore')
import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()
from matplotlib import pyplot as plt
import seaborn as sns
from functools import reduce
import scipy
from scipy.stats import t, beta as b, gamma
import pymc3 as pm
import logging
# Silence pymc3's verbose sampling logs
logger = logging.getLogger('pymc3')
logger.setLevel(logging.ERROR)
import arviz as az
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import StackingRegressor
import shap
# Enable shap's interactive JS visualizations in the notebook
shap.initjs()
# Load the A/B test data: one row per campaign with treatment flag,
# company size, spend, and budget (shape and schema shown below).
df = pd.read_csv("./analytical_take_home_data_v3.csv")
print(df.shape)
df.head()
(15474, 4)
| treatment | company_size | campaign_spend | campaign_budget | |
|---|---|---|---|---|
| 0 | False | small | 10.45 | 3.90 |
| 1 | False | medium | 3.78 | 1.99 |
| 2 | False | medium | 46.19 | 55.45 |
| 3 | False | small | 6.63 | 6.51 |
| 4 | False | small | 92.34 | 83.10 |
# Create indicators of overspend (and underspend for back-up analysis).
# Vectorized with np.where: the original per-row loop used chained assignment
# (df['col'][i] = ...), which is O(n) Python-level work and relies on
# SettingWithCopy behavior; this computes identical values in one pass.
excess = df['campaign_spend'] - df['campaign_budget']  # >0 means overspend
df['overspend'] = np.where(excess > 0, excess, 0.0)
df['overspend_ratio'] = np.where(excess > 0, excess / df['campaign_budget'], 0.0)
df['underspend'] = np.where(excess < 0, -excess, 0.0)
df['underspend_ratio'] = np.where(excess < 0, -excess / df['campaign_budget'], 0.0)
# Define an overspend (underspend) campaign as >1% of its budget
df['over_one_percent'] = (df['overspend_ratio'] > 0.01).astype(int)
df['under_one_percent'] = (df['underspend_ratio'] > 0.01).astype(int)
# Log-scaled spend and budget for visualization of the right-skewed amounts
df['spend_log'], df['budget_log'] = np.log(df['campaign_spend']), np.log(df['campaign_budget'])
df.head()
| treatment | company_size | campaign_spend | campaign_budget | overspend | overspend_ratio | underspend | underspend_ratio | over_one_percent | under_one_percent | spend_log | budget_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | small | 10.45 | 3.90 | 6.54 | 1.68 | 0.00 | 0.00 | 1 | 0 | 2.35 | 1.36 |
| 1 | False | medium | 3.78 | 1.99 | 1.79 | 0.90 | 0.00 | 0.00 | 1 | 0 | 1.33 | 0.69 |
| 2 | False | medium | 46.19 | 55.45 | 0.00 | 0.00 | 9.26 | 0.17 | 0 | 1 | 3.83 | 4.02 |
| 3 | False | small | 6.63 | 6.51 | 0.11 | 0.02 | 0.00 | 0.00 | 1 | 0 | 1.89 | 1.87 |
| 4 | False | small | 92.34 | 83.10 | 9.24 | 0.11 | 0.00 | 0.00 | 1 | 0 | 4.53 | 4.42 |
# Check data quality: per-column dtype, cardinality, and availability.
# (The original appended tuples in a loop and then ran a redundant
# set_index('Column name').reset_index() round-trip; both removed.)
quality = [
    (col,
     df[col].dtype,
     df[col].nunique(),
     df.shape[0] - df[col].isnull().sum(),
     round(100 - df[col].isnull().sum() * 100 / df.shape[0], 2))
    for col in df.columns
]
pd.DataFrame(quality, columns=['Column name', 'Type', 'Unique value', 'Available Value', '% Availability'])
| Column name | Type | Unique value | Available Value | % Availability | |
|---|---|---|---|---|---|
| 0 | treatment | bool | 2 | 15474 | 100.0 |
| 1 | company_size | object | 3 | 15474 | 100.0 |
| 2 | campaign_spend | float64 | 15375 | 15474 | 100.0 |
| 3 | campaign_budget | float64 | 15367 | 15474 | 100.0 |
| 4 | overspend | float64 | 11816 | 15474 | 100.0 |
| 5 | overspend_ratio | float64 | 11923 | 15474 | 100.0 |
| 6 | underspend | float64 | 3495 | 15474 | 100.0 |
| 7 | underspend_ratio | float64 | 3495 | 15474 | 100.0 |
| 8 | over_one_percent | int64 | 2 | 15474 | 100.0 |
| 9 | under_one_percent | int64 | 2 | 15474 | 100.0 |
| 10 | spend_log | float64 | 15375 | 15474 | 100.0 |
| 11 | budget_log | float64 | 15367 | 15474 | 100.0 |
# Check outliers: budget, spend, and overspend distributions per company size,
# separately for the control and treatment groups.
cont = df[df.treatment == False]
treat = df[df.treatment == True]

def _size_box(data, column, title):
    # One narrow boxplot panel, split along the x-axis by company size
    return alt.Chart(data, title=title).mark_boxplot().encode(
        x='company_size:O',
        y=alt.Y(f'{column}:Q', title=''),
    ).properties(width=80, height=300)

g1 = _size_box(cont, 'campaign_budget', 'Control budget')
g2 = _size_box(treat, 'campaign_budget', 'Treatment budget')
g3 = _size_box(cont, 'campaign_spend', 'Control spend')
g4 = _size_box(treat, 'campaign_spend', 'Treatment spend')
g5 = _size_box(cont, 'overspend', 'Control overspend')
g6 = _size_box(treat, 'overspend', 'Treatment overspend')
g1 | g2 | g3 | g4 | g5 | g6
# Remove outliers of spend, budget, and overspend to generalize the analysis
within_limits = (
    (df.campaign_budget < 10000000)
    & (df.campaign_spend < 3000000)
    & (df.overspend < 250000)
)
df2 = df[within_limits]
# Confirm the trim barely changes the group sizes
print('Before\n', df.treatment.value_counts(), '\n')
print('After\n', df2.treatment.value_counts())
Before True 7741 False 7733 Name: treatment, dtype: int64 After True 7739 False 7733 Name: treatment, dtype: int64
# Quick sanity check of the company-size split between test groups
size_mix = alt.Chart(df2, title='Company size proportion by test group').mark_bar().encode(
    x=alt.X('treatment:O'),
    y=alt.Y('count():Q', title='count of campaigns'),
    color=alt.Color('company_size', scale=alt.Scale(scheme='tealblues')),
    tooltip=[alt.Tooltip('count():Q')],
).properties(width=400, height=200)
size_mix
# Dataframe objects for analysis by subgroup: treatment flag x company size
cont = df2[df2.treatment == False]
treat = df2[df2.treatment == True]
_sizes = ('large', 'medium', 'small')
cl, cm, cs = (cont[cont.company_size == s] for s in _sizes)
tl, tm, ts = (treat[treat.company_size == s] for s in _sizes)
# Means of the numerical indicators, overall per treatment group
num_cols = ['campaign_budget','over_one_percent','overspend_ratio','under_one_percent', 'underspend_ratio']
df2.groupby(['treatment'])[num_cols].mean()
| campaign_budget | over_one_percent | overspend_ratio | under_one_percent | underspend_ratio | |
|---|---|---|---|---|---|
| treatment | |||||
| False | 4641.83 | 0.74 | 0.31 | 0.19 | 0.06 |
| True | 5350.74 | 0.67 | 0.26 | 0.26 | 0.08 |
# Same means, further broken down by company size
df2.groupby(['treatment','company_size'])[num_cols].mean()
| campaign_budget | over_one_percent | overspend_ratio | under_one_percent | underspend_ratio | ||
|---|---|---|---|---|---|---|
| treatment | company_size | |||||
| False | large | 5933.97 | 0.69 | 0.19 | 0.23 | 0.07 |
| medium | 6342.86 | 0.61 | 0.13 | 0.27 | 0.08 | |
| small | 3538.66 | 0.79 | 0.42 | 0.15 | 0.05 | |
| True | large | 12655.71 | 0.62 | 0.13 | 0.30 | 0.10 |
| medium | 4612.67 | 0.60 | 0.15 | 0.31 | 0.10 | |
| small | 1585.48 | 0.71 | 0.35 | 0.23 | 0.07 |
# Medians of the key indicators, as some of the variables are right-skewed
num_cols = ['campaign_budget','overspend_ratio']
df2.groupby(['treatment'])[num_cols].median()
| campaign_budget | overspend_ratio | |
|---|---|---|
| treatment | ||
| False | 65.38 | 0.06 |
| True | 38.58 | 0.05 |
# Same medians, further broken down by company size
df2.groupby(['treatment','company_size'])[num_cols].median()
| campaign_budget | overspend_ratio | ||
|---|---|---|---|
| treatment | company_size | ||
| False | large | 145.38 | 0.04 |
| medium | 95.23 | 0.02 | |
| small | 28.99 | 0.13 | |
| True | large | 129.57 | 0.03 |
| medium | 90.59 | 0.02 | |
| small | 20.29 | 0.08 |
# Correlation heatmaps (lower triangle only) for control vs treatment,
# excluding the treatment flag, the log transforms, and the underspend metrics.
drop_cols = ['treatment', 'spend_log', 'budget_log', 'underspend', 'under_one_percent', 'underspend_ratio']
corr0 = cont.loc[:, cont.columns.difference(drop_cols)].corr()
# np.bool was deprecated in NumPy 1.20 and removed in 1.24; use builtin bool
mask0 = np.zeros_like(corr0, dtype=bool)
mask0[np.triu_indices_from(mask0)] = True
corr1 = treat.loc[:, treat.columns.difference(drop_cols)].corr()
mask1 = np.zeros_like(corr1, dtype=bool)
mask1[np.triu_indices_from(mask1)] = True
fig, ax = plt.subplots(ncols=2, figsize=(16, 5))
sns.heatmap(corr0, mask=mask0, square=True, linewidths=.5, ax=ax[0], annot=True, cmap="GnBu").set_title("Control correlation map")
sns.heatmap(corr1, mask=mask1, square=True, linewidths=.5, ax=ax[1], annot=True, cmap="GnBu").set_title("Treatment correlation map")
Text(0.5, 1.0, 'Treatment correlation map')
# Check the overspend campaign proportion performance.
# Select the column before summing so only the metric of interest is aggregated
# (summing object columns like company_size is wasteful and deprecated-prone).
print('Before processing\n',
      df.groupby('treatment')[['over_one_percent']].sum().reset_index().rename(columns={'over_one_percent':'# of overspend campaigns'}),'\n')
print('After processing\n',
      df2.groupby('treatment')[['over_one_percent']].sum().reset_index().rename(columns={'over_one_percent':'# of overspend campaigns'}),'\n')
cmap = plt.cm.GnBu
colors = cmap(np.linspace(0.5, 0.9, 3))
fig, ax = plt.subplots(ncols=2, figsize=(10,5))
# BUG FIX: value_counts() orders by frequency, which would silently flip the
# Yes/No labels if non-overspenders ever became the majority; reindex pins
# 1 -> 'Yes' and 0 -> 'No'.
ax[0].pie(cont.over_one_percent.value_counts().reindex([1, 0], fill_value=0), labels=['Yes','No'], autopct='%1.0f%%', colors=colors)
ax[0].set_title('Control', fontsize=14)
ax[0].set_ylabel('Overspend campaigns', fontsize=14)
ax[1].pie(treat.over_one_percent.value_counts().reindex([1, 0], fill_value=0), labels=['Yes','No'], autopct='%1.0f%%', colors=colors)
ax[1].set_title('Treatment', fontsize=14)
plt.tight_layout()
Before processing
treatment # of overspend campaigns
0 False 5716
1 True 5180
After processing
treatment # of overspend campaigns
0 False 5716
1 True 5179
# Check overspend proportion by company size: control row, then treatment row.
def _overspend_pie(axis, data, title, ylabel=None):
    # BUG FIX: value_counts() orders by frequency, which would silently flip
    # the Yes/No labels for a subgroup where overspenders are the minority;
    # reindex pins 1 -> 'Yes' and 0 -> 'No'.
    counts = data.over_one_percent.value_counts().reindex([1, 0], fill_value=0)
    axis.pie(counts, labels=['Yes','No'], autopct='%1.0f%%', colors=colors)
    axis.set_title(title, fontsize=14)
    if ylabel:
        axis.set_ylabel(ylabel, fontsize=14)

fig, ax = plt.subplots(ncols=3, figsize=(12,5))
_overspend_pie(ax[0], cl, 'Control large companies', 'Overspend campaigns')
_overspend_pie(ax[1], cm, 'Control medium companies')
_overspend_pie(ax[2], cs, 'Control small companies')
plt.tight_layout()
fig, ax = plt.subplots(ncols=3, figsize=(12,5))
_overspend_pie(ax[0], tl, 'Treatment large companies', 'Overspend campaigns')
_overspend_pie(ax[1], tm, 'Treatment medium companies')
_overspend_pie(ax[2], ts, 'Treatment small companies')
plt.tight_layout()
# KDE of the binary overspend flag per size and group.
# NOTE(review): a KDE over a 0/1 indicator mostly shows two bumps; a bar
# chart of the proportions may communicate this more directly — confirm intent.
sns.displot(df2.sort_values(by=["company_size"]), x="over_one_percent", hue="treatment", col="company_size", kind="kde", fill=True)
plt.show()
# Company-size mix among overspending vs non-overspending campaigns.
# sort_index() fixes the slice order to large/medium/small so the labels match.
over = df2[df2.over_one_percent == 1]
non_over = df2[df2.over_one_percent == 0]
fig, ax = plt.subplots(ncols=2, figsize=(10,5))
for axis, frame, name in zip(ax, (over, non_over), ('Overspend campains', 'Non overspend campains')):
    axis.pie(frame.company_size.value_counts().sort_index(), labels=['large','medium','small'], autopct='%1.0f%%', colors=colors)
    axis.set_title(name, fontsize=14)
ax[0].set_ylabel('Company size proportion', fontsize=14)
plt.tight_layout()
# Log spend vs log budget per group, with a min-to-max diagonal reference line:
# points above the line spent more than their budget.
def _scatter_with_ref(data, title):
    pts = alt.Chart(data, title=title).mark_circle(opacity=0.8, strokeWidth=1).encode(
        x=alt.X('budget_log'),
        y=alt.Y('spend_log'),
        color='company_size',
    ).properties(width=350, height=300)
    ref = pd.DataFrame({
        'spend_log': [data.spend_log.min(), data.spend_log.max()],
        'budget_log': [data.budget_log.min(), data.budget_log.max()],
    })
    ref_line = alt.Chart(ref).mark_line(color='grey').encode(
        x='budget_log',
        y='spend_log',
    )
    return pts + ref_line

_scatter_with_ref(cont, 'Control group') | _scatter_with_ref(treat, 'Treatment group')
# Overspend-ratio distribution by company size, control vs treatment overlaid
sns.displot(df2.sort_values(by=["company_size"]), x="overspend_ratio", hue="treatment", col="company_size", col_order=['large','medium','small'], kind="kde", fill=True)
plt.show()
# Pooled (both groups) overspend-ratio boxplot by company size
g = alt.Chart(df2, title='Overspend ratio').mark_boxplot().encode(x='overspend_ratio:Q', y='company_size:O').properties(width=400,height=180)
g
# Side-by-side overspend-ratio boxplots per company size, by test group
def _ratio_box(data, title):
    return alt.Chart(data, title=title).mark_boxplot().encode(
        x='overspend_ratio:Q', y='company_size:O'
    ).properties(width=400, height=180)

_ratio_box(cont, 'Control overspend ratio') | _ratio_box(treat, 'Treatment overspend ratio')
# Log-budget distribution by company size, control vs treatment overlaid
sns.displot(df2.sort_values(by=["company_size"]), x="budget_log", hue="treatment", col="company_size", kind="kde", fill=True)
<seaborn.axisgrid.FacetGrid at 0x7fcd8297f250>
# Log-budget boxplots per company size with a full-range whisker rule,
# control and treatment side by side.
def _budget_box(data, title):
    base = alt.Chart(data, title=title).mark_boxplot().encode(
        x='budget_log:Q', y='company_size:O'
    ).properties(width=400, height=100)
    whisker = base.mark_rule().encode(x='min(budget_log)', x2='max(budget_log)')
    return base + whisker

_budget_box(cont, 'Control log budget') | _budget_box(treat, 'Treatment log budget')
# Pooled log-budget boxplot with full-range whisker rule.
# NOTE(review): this chart uses df2 (both groups) but the title says
# 'Control log budget' — confirm which was intended.
pooled = alt.Chart(
    df2, title='Control log budget').mark_boxplot().encode(
    x='budget_log:Q', y='company_size:O').properties(width=400, height=100)
pooled + pooled.mark_rule().encode(x='min(budget_log)', x2='max(budget_log)')
# Logistic regression: predict a >1% overspend from company size + log budget.
# One-hot encode company_size; 'medium' is dropped as the reference level.
df_dummy = pd.get_dummies(df2[['company_size', 'over_one_percent', 'budget_log']])
df_dummy.drop(['company_size_medium'], axis=1, inplace=True)
train, test = train_test_split(df_dummy, test_size=0.2, random_state=0)
X_train = train.drop(['over_one_percent'], axis=1).astype(float)
y_train = train['over_one_percent'].astype(float)
X_test = test.drop(['over_one_percent'], axis=1).astype(float)
y_test = test['over_one_percent'].astype(float)
# NOTE(review): no intercept is added (sm.add_constant); the size dummies
# absorb the baseline effect — confirm this is intentional.
logit = sm.Logit(y_train, X_train).fit()
yhat = logit.predict(X_test)
# Threshold predicted probabilities at 0.5 via round()
pred = list(map(round, yhat))
print('Accuracy: ', accuracy_score(y_test, pred))
Optimization terminated successfully.
Current function value: 0.587590
Iterations 5
Accuracy: 0.6962843295638126
# Coefficient table for the fitted logit model
logit.summary()
| Dep. Variable: | over_one_percent | No. Observations: | 12377 |
|---|---|---|---|
| Model: | Logit | Df Residuals: | 12374 |
| Method: | MLE | Df Model: | 2 |
| Date: | Fri, 26 Nov 2021 | Pseudo R-squ.: | 0.03570 |
| Time: | 19:18:22 | Log-Likelihood: | -7272.6 |
| converged: | True | LL-Null: | -7541.8 |
| Covariance Type: | nonrobust | LLR p-value: | 1.164e-117 |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| budget_log | -0.1499 | 0.007 | -20.690 | 0.000 | -0.164 | -0.136 |
| company_size_large | 1.4428 | 0.052 | 27.863 | 0.000 | 1.341 | 1.544 |
| company_size_small | 1.6421 | 0.040 | 41.105 | 0.000 | 1.564 | 1.720 |
# Gradient-boosted tree classifier on the same features, explained with SHAP
xgb = xgboost.XGBClassifier(objective='binary:logistic', n_estimators=12, eta=0.1, grow_policy='lossguide',
            reg_lambda=5, reg_alpha=5, eval_metric=["logloss"]).fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred_xgb))
# SHAP values are computed on the training set, not the held-out test set
explainer = shap.TreeExplainer(xgb)
shap_values_xgb = explainer.shap_values(X_train)
# Force plot for a single observation, then the global importance summary
shap.force_plot(explainer.expected_value, shap_values_xgb[0,:], X_train.iloc[0,:])
shap.summary_plot(shap_values_xgb, X_train)
Accuracy: 0.7408723747980613
# How log budget and the small-company dummy drive the overspend prediction
shap.dependence_plot('budget_log', shap_values_xgb, X_train)
shap.dependence_plot('company_size_small', shap_values_xgb, X_train)
# NOTE(review): 'large' and 'small' are assigned but never used in the code
# below — verify whether per-size models were intended here.
large, small = df2[df2.company_size=='large'], df2[df2.company_size=='small']
# Balance check: try to predict treatment assignment from company size and
# log budget; accuracy near 0.5 indicates the groups are well randomized.
df_dummy = pd.get_dummies(df2[['company_size', 'budget_log', 'treatment']])
df_dummy.drop(['company_size_medium'], axis=1, inplace=True)
train, test = train_test_split(df_dummy, test_size=0.1, random_state=42)
X_train = train.drop(['treatment'], axis=1).astype(float)
y_train = train['treatment'].astype(float)
X_test = test.drop(['treatment'], axis=1).astype(float)
y_test = test['treatment'].astype(float)
xgb2 = xgboost.XGBClassifier(objective='binary:logistic', n_estimators=20, eta=0.1, grow_policy='lossguide',
            reg_lambda=5, reg_alpha=5, eval_metric=["logloss"]).fit(X_train, y_train)
pred_xgb = xgb2.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, pred_xgb))
# SHAP explanation of which features leak information about group assignment
explainer = shap.TreeExplainer(xgb2)
shap_values_xgb = explainer.shap_values(X_train)
shap.force_plot(explainer.expected_value, shap_values_xgb[0,:], X_train.iloc[0,:])
shap.summary_plot(shap_values_xgb, X_train)
Accuracy: 0.5633074935400517
# Feature-level views of what separates treatment from control in the model
shap.dependence_plot('budget_log', shap_values_xgb, X_train)
shap.dependence_plot('company_size_small', shap_values_xgb, X_train)
shap.dependence_plot('company_size_large', shap_values_xgb, X_train)
# Probability of a >1% overspend campaign per treatment x company-size cell.
# Select the single column before aggregating instead of averaging every
# column and indexing afterwards (also avoids pandas numeric_only warnings).
tmp1 = df2.groupby(['treatment','company_size'])['over_one_percent'].mean().reset_index()
print('='*45, '\n >1% overspend campaign probability \n', '='*45)
pd.pivot_table(
    tmp1, values=['over_one_percent'], index='company_size',
    columns='treatment', aggfunc='mean').transpose()[:2]
============================================= >1% overspend campaign probability =============================================
| company_size | large | medium | small | |
|---|---|---|---|---|
| treatment | ||||
| over_one_percent | False | 0.69 | 0.61 | 0.79 |
| True | 0.62 | 0.60 | 0.71 |
# Average overspend ratio per treatment x company-size cell.
# Select the single column before aggregating instead of averaging every
# column and indexing afterwards (also avoids pandas numeric_only warnings).
tmp2 = df2.groupby(['treatment','company_size'])['overspend_ratio'].mean().reset_index()
print('='*45, '\n Average overspend ratio per campaign \n', '='*45)
pd.pivot_table(
    tmp2, values=['overspend_ratio'], index='company_size',
    columns='treatment', aggfunc='mean').transpose()[:2]
============================================= Average overspend ratio per campaign =============================================
| company_size | large | medium | small | |
|---|---|---|---|---|
| treatment | ||||
| overspend_ratio | False | 0.19 | 0.13 | 0.42 |
| True | 0.13 | 0.15 | 0.35 |
def t_test(group1, group2, metric, test, alternative, title, alpha=0.02):
    '''
    Manual two-sample t-test for a proportion or a mean.

    Parameters:
        - group1: control group (dataframe)
        - group2: treatment group (dataframe)
        - metric: column name of the metric to test (text)
        - test: test type, 'proportion' or 'mean' (text)
        - alternative: 'less', 'greater', or 'two-sided' (text)
        - title: title of the test (text)
        - alpha: significance level
    Return:
        - Prints a summary of the parameters and the statistical conclusion
    '''
    # Step 1: point estimates, delta, and standard error
    n_c = group1[metric].count()
    n_t = group2[metric].count()
    if test == 'proportion':
        p_c = group1[metric].sum() / n_c
        p_t = group2[metric].sum() / n_t
        delta = p_t - p_c
        # pooled proportion for the standard error under H0
        p = (group1[metric].sum() + group2[metric].sum()) / (n_c + n_t)
        se = np.sqrt((p * (1 - p)) * (1 / n_c + 1 / n_t))
    elif test == 'mean':
        p_c = group1[metric].mean()
        p_t = group2[metric].mean()
        delta = p_t - p_c
        std_c = np.sqrt(np.var(group1[metric], ddof=1))
        std_t = np.sqrt(np.var(group2[metric], ddof=1))
        # pooled standard deviation (assumes roughly equal group variances)
        std_p = np.sqrt(((n_c - 1) * std_c**2 + (n_t - 1) * std_t**2) / (n_c + n_t - 2))
        se = std_p * np.sqrt(1 / n_c + 1 / n_t)
    else:
        raise ValueError("test must be 'proportion' or 'mean'")
    # Step 2: t-statistic, critical value, confidence interval, and p-value
    t_stat = delta / se
    dof = n_c + n_t - 2
    if alternative == 'less':
        critical_val = scipy.stats.t.ppf(1 - alpha, df=dof)  # one-sided
        p_val = round(scipy.stats.t.cdf(t_stat, df=dof), 3)
        conf_int = ["-∞", round(delta + critical_val * se, 3)]
    elif alternative == 'greater':
        critical_val = scipy.stats.t.ppf(1 - alpha, df=dof)  # one-sided
        p_val = round(1 - scipy.stats.t.cdf(t_stat, df=dof), 3)
        # BUG FIX: the lower bound of a one-sided CI is delta - critical*se
        # (the original added it, producing an interval that excluded delta)
        conf_int = [round(delta - critical_val * se, 3), "∞"]
    elif alternative == 'two-sided':
        # BUG FIX: a two-sided test puts alpha/2 in each tail; the original
        # used the one-sided 1-alpha quantile, understating the critical value
        critical_val = scipy.stats.t.ppf(1 - alpha / 2, df=dof)
        p_val = round((1 - scipy.stats.t.cdf(abs(t_stat), df=dof)) * 2, 3)
        conf_int = [round(delta - critical_val * se, 3), round(delta + critical_val * se, 3)]
    else:
        raise ValueError("alternative must be 'less', 'greater', or 'two-sided'")
    # Step 3: summarize the analysis and output the conclusion
    print('-' * 52)
    print(f'T-test: {title}')
    print('-' * 52, '\n')
    print('-' * 22, 'Result', '-' * 22)
    print(f'Control: {round(p_c, 3)}')
    print(f'Treatment: {round(p_t, 3)}')
    print(f'Delta: {round(delta, 3)}')
    print(f'Standard error: {round(se, 3)}')
    print(f'T-statistic: {round(t_stat, 3)}')
    print(f'Critical value: {round(critical_val, 3)}')
    # The interval was computed but never reported in the original
    print(f'Confidence interval: {conf_int}')
    print(f'P-value: {p_val}')
    print(f'Alpha: {alpha}')
    if p_val < alpha:
        print('\nReject the null hypothesis.')
    else:
        print('\nFail to reject the null hypothesis.')
# H1: treatment lowers the overspend-campaign proportion (alpha relaxed to 0.05)
t_test(cont, treat, 'over_one_percent', 'proportion', 'less',
       'All companies overspend campaign proportion', alpha=0.05)
---------------------------------------------------- T-test: All companies overspend campaign proportion ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.739 Treatment: 0.669 Delta: -0.07 Standard error: 0.007 T-statistic: -9.533 Critical value: 1.645 P-value: 0.0 Alpha: 0.05 Reject the null hypothesis.
# H1: treatment lowers the average overspend ratio (default alpha=0.02)
t_test(cont, treat, 'overspend_ratio', 'mean', 'less', 'All companies average overspend ratio')
---------------------------------------------------- T-test: All companies average overspend ratio ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.313 Treatment: 0.259 Delta: -0.054 Standard error: 0.011 T-statistic: -4.967 Critical value: 2.054 P-value: 0.0 Alpha: 0.02 Reject the null hypothesis.
# Two-sided check that average budgets are balanced between groups
t_test(cont, treat, 'campaign_budget', 'mean', 'two-sided', 'All companies average budget')
---------------------------------------------------- T-test: All companies average budget ---------------------------------------------------- ---------------------- Result ---------------------- Control: 4641.828 Treatment: 5350.744 Delta: 708.917 Standard error: 855.598 T-statistic: 0.829 Critical value: 2.054 P-value: 0.4073647671463272 Alpha: 0.02 Fail to reject the null hypothesis.
# Large companies: does treatment lower the overspend-campaign proportion?
t_test(cl, tl, 'over_one_percent', 'proportion', 'less', 'Large companies overspend campaign proportion')
---------------------------------------------------- T-test: Large companies overspend campaign proportion ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.694 Treatment: 0.615 Delta: -0.079 Standard error: 0.013 T-statistic: -5.956 Critical value: 2.054 P-value: 0.0 Alpha: 0.02 Reject the null hypothesis.
# Large companies: does treatment lower the average overspend ratio?
t_test(cl, tl, 'overspend_ratio', 'mean', 'less', 'Large companies average overspend ratio')
---------------------------------------------------- T-test: Large companies average overspend ratio ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.193 Treatment: 0.129 Delta: -0.064 Standard error: 0.013 T-statistic: -5.111 Critical value: 2.054 P-value: 0.0 Alpha: 0.02 Reject the null hypothesis.
# Large companies: two-sided budget-balance check between groups
t_test(cl, tl, 'campaign_budget', 'mean', 'two-sided', 'Large companies average budget')
---------------------------------------------------- T-test: Large companies average budget ---------------------------------------------------- ---------------------- Result ---------------------- Control: 5933.969 Treatment: 12655.707 Delta: 6721.738 Standard error: 2163.16 T-statistic: 3.107 Critical value: 2.054 P-value: 0.0018979008624264715 Alpha: 0.02 Reject the null hypothesis.
# Medium companies: does treatment lower the overspend-campaign proportion?
t_test(cm, tm, 'over_one_percent', 'proportion', 'less', 'Medium companies overspend campaign proportion')
---------------------------------------------------- T-test: Medium companies overspend campaign proportion ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.615 Treatment: 0.598 Delta: -0.017 Standard error: 0.026 T-statistic: -0.645 Critical value: 2.056 P-value: 0.259 Alpha: 0.02 Fail to reject the null hypothesis.
# Medium companies: does treatment lower the average overspend ratio?
t_test(cm, tm, 'overspend_ratio', 'mean', 'less', 'Medium companies average overspend ratio')
---------------------------------------------------- T-test: Medium companies average overspend ratio ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.129 Treatment: 0.149 Delta: 0.02 Standard error: 0.02 T-statistic: 1.042 Critical value: 2.056 P-value: 0.851 Alpha: 0.02 Fail to reject the null hypothesis.
# NOTE(review): this budget check uses a one-sided 'less' alternative, unlike
# the two-sided budget tests for the other company sizes — confirm intent.
t_test(cm, tm, 'campaign_budget', 'mean', 'less', 'Medium companies average budget')
---------------------------------------------------- T-test: Medium companies average budget ---------------------------------------------------- ---------------------- Result ---------------------- Control: 6342.855 Treatment: 4612.671 Delta: -1730.184 Standard error: 1510.173 T-statistic: -1.146 Critical value: 2.056 P-value: 0.126 Alpha: 0.02 Fail to reject the null hypothesis.
# Small companies: does treatment lower the overspend-campaign proportion?
t_test(cs, ts, 'over_one_percent', 'proportion', 'less', 'Small companies overspend campaign proportion')
---------------------------------------------------- T-test: Small companies overspend campaign proportion ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.789 Treatment: 0.709 Delta: -0.08 Standard error: 0.009 T-statistic: -8.697 Critical value: 2.054 P-value: 0.0 Alpha: 0.02 Reject the null hypothesis.
# Small companies: does treatment lower the average overspend ratio?
t_test(cs, ts, 'overspend_ratio', 'mean', 'less', 'Small companies average overspend ratio')
---------------------------------------------------- T-test: Small companies average overspend ratio ---------------------------------------------------- ---------------------- Result ---------------------- Control: 0.42 Treatment: 0.345 Delta: -0.075 Standard error: 0.017 T-statistic: -4.47 Critical value: 2.054 P-value: 0.0 Alpha: 0.02 Reject the null hypothesis.
# Small companies: two-sided budget-balance check between groups
t_test(cs, ts, 'campaign_budget', 'mean', 'two-sided', 'Small companies average budget')
---------------------------------------------------- T-test: Small companies average budget ---------------------------------------------------- ---------------------- Result ---------------------- Control: 3538.656 Treatment: 1585.48 Delta: -1953.176 Standard error: 764.954 T-statistic: -2.553 Critical value: 2.054 P-value: 0.010686568319221879 Alpha: 0.02 Reject the null hypothesis.
# Check the distribution of indicators to choose prior and likelihood functions
partitions = np.array_split(df2, 100)
prob_o = [p['over_one_percent'].mean() for p in partitions]
fig, ax = plt.subplots(ncols=2, nrows=3, figsize=(15, 12))
sns.histplot(prob_o, kde=True, ax=ax[0,0])
# NOTE(review): the data here is all of df2 (both groups) but the title says
# 'control' — confirm which was intended.
ax[0,0].set_title('Histogram of control')
ax[0,0].set_ylabel('Overspend probability')
# Beta prior candidates for the Bernoulli overspend probability
x = np.linspace(0.5, 1, 1000)
weak, strong = b(20, 8), b(40, 15)
ax[0,1].plot(x, weak.pdf(x), label='weak(20, 8)')
ax[0,1].plot(x, strong.pdf(x), label='strong(40, 15)')
ax[0,1].set_title('Prior assumptions')
ax[0,1].legend()
sns.histplot(df2.overspend_ratio, kde=True, ax=ax[1,0])
ax[1,0].set_ylabel('Average overspend ratio')
ax[1,0].set_xlabel('')
# Gamma prior candidates for the overspend ratio.
# NOTE(review): scipy's gamma(a, x) treats the second positional argument as a
# location shift, not a rate/scale — confirm that is what was intended.
x = np.linspace(0, 4, 1000)
weak, strong = gamma(1.5, 0.1), gamma(1, 0.1)
ax[1,1].plot(x, weak.pdf(x), label='weak(1.5, 0.1)')
ax[1,1].plot(x, strong.pdf(x), label='strong(1, 0.1)')
ax[1,1].legend()
partitions = np.array_split(df2, 100)
mean_b = [p['campaign_budget'].mean() for p in partitions]
sns.histplot(mean_b, kde=True, ax=ax[2,0])
ax[2,0].set_ylabel('Average campaign budget')
# Gamma prior candidates for the budget
x = np.linspace(0, 20000, 50)
weak, strong = gamma(1.1, 0.6), gamma(1, 0.5)
# BUG FIX: the legend previously read weak(1, 0.1)/strong(1.5, 0.1), which did
# not match the gamma parameters actually plotted above.
ax[2,1].plot(x, weak.pdf(x), label='weak(1.1, 0.6)')
ax[2,1].plot(x, strong.pdf(x), label='strong(1, 0.5)')
ax[2,1].legend()
plt.tight_layout()
def bayesian_ab(group1, group2, metric, distribution, winning_method, title):
    '''
    Bayesian A/B test comparing a metric between control and treatment.

    Parameters:
    - group1: control group (DataFrame)
    - group2: treatment group (DataFrame)
    - metric: column name to test (str)
    - distribution: likelihood family, 'bernoulli' or 'poisson' (str)
    - winning_method: 'reduce' if the treatment wins by lowering the metric,
      'increase' if it wins by raising it (str)
    - title: title printed above the results (str)

    Returns:
    - summary: pm.summary() DataFrame of the posterior
    - trace: sampled trace with 'control', 'treatment', 'improvement'

    Raises:
    - ValueError: for an unrecognised distribution or winning_method
      (previously these fell through and crashed later with a NameError).

    Side effects: prints winning probability / expected losses and shows
    forest + posterior plots.
    '''
    # Step 1: hyper-priors and priors shared by both groups, plus per-group
    # likelihoods over the observed metric values.
    if distribution == 'bernoulli':
        with pm.Model() as model:
            alpha = pm.Uniform("alpha", lower=0, upper=100)
            beta = pm.Uniform("beta", lower=0, upper=100)
            control = pm.Beta('control', alpha=alpha, beta=beta)
            treatment = pm.Beta('treatment', alpha=alpha, beta=beta)
            control_like = pm.Bernoulli('control_like', p=control, observed=group1[metric])
            treatment_like = pm.Bernoulli('treatment_like', p=treatment, observed=group2[metric])
    elif distribution == 'poisson':
        with pm.Model() as model:
            k = pm.Uniform("k", lower=0, upper=5)
            theta = pm.Uniform("theta", lower=0, upper=5)
            control = pm.Gamma('control', alpha=k, beta=theta)
            treatment = pm.Gamma('treatment', alpha=k, beta=theta)
            control_like = pm.Poisson('control_like', control, observed=group1[metric])
            treatment_like = pm.Poisson('treatment_like', treatment, observed=group2[metric])
    else:
        raise ValueError(f"Unknown distribution: {distribution!r}")
    with model:
        improvement = pm.Deterministic('improvement', treatment - control)
        # draw samples from posterior distribution with 95% acceptance ratio
        trace = pm.sample(2000, tune=10000, target_accept=0.95, chains=4,
                          step=[pm.Metropolis(), pm.NUTS()])
        summary = pm.summary(trace)
    # Step 2: winning probability over control and expected losses, computed
    # per posterior draw. Vectorised with numpy; identical sums to the
    # original map/zip/reduce formulation.
    t_samp, c_samp = trace['treatment'], trace['control']
    n_c, n_t = group1[metric].count(), group2[metric].count()
    if winning_method == 'reduce':
        wins = t_samp < c_samp
        loss_sum_c = np.maximum(c_samp - t_samp, 0).sum()  # cost of keeping control
        loss_sum_t = np.maximum(t_samp - c_samp, 0).sum()  # cost of shipping treatment
    elif winning_method == 'increase':
        wins = t_samp > c_samp
        loss_sum_c = np.maximum(t_samp - c_samp, 0).sum()
        loss_sum_t = np.maximum(c_samp - t_samp, 0).sum()
    else:
        raise ValueError(f"Unknown winning_method: {winning_method!r}")
    # NOTE(review): losses are normalised by the observed sample sizes rather
    # than by the number of posterior draws — confirm this is intentional.
    exp_loss_c = loss_sum_c / n_c
    exp_loss_t = loss_sum_t / n_t
    print('-' * 55)
    print(title)
    print('-' * 55)
    print(f'Probability of beating control: {np.mean(wins) * 100}%')
    print(f'Expected loss of choosing control: {round(exp_loss_c, 2)}, treatment: {round(exp_loss_t, 2)}')
    # Step 3: Plot posterior distributions
    with model:
        az.plot_forest(trace, var_names=['control', 'treatment'])
        pm.plot_posterior(trace, var_names=['control', 'treatment', 'improvement'])
        plt.show()
    return summary, trace
# Bayesian tests over all companies: treatment should reduce overspend
# probability and ratio, and may increase campaign budget.
summary_oa, trace_oa = bayesian_ab(cont, treat, 'over_one_percent', 'bernoulli', 'reduce', 'All companies overspend probability control vs treatment')
------------------------------------------------------- All companies overspend probability control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 0.07, treatment: 0.0
summary_ra, trace_ra = bayesian_ab(cont, treat, 'overspend_ratio', 'poisson', 'reduce', 'All companies overspend ratio control vs treatment')
------------------------------------------------------- All companies overspend ratio control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 0.03, treatment: 0.0
summary_ba, trace_ba = bayesian_ab(cont, treat, 'campaign_budget', 'poisson', 'increase', 'All companies campaign budget control vs treatment')
------------------------------------------------------- All companies campaign budget control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 733.42, treatment: 0.0
# Analytical Beta posterior for the overspend probability, all companies.
# Per-group counts: total observations, overspenders, non-overspenders.
# (These names are reused by the following cell — keep them.)
n_c, n_t = cont['over_one_percent'].count(), treat['over_one_percent'].count()
true_c, true_t = cont['over_one_percent'].sum(), treat['over_one_percent'].sum()
false_c, false_t = n_c - true_c, n_t - true_t
# Posterior means of the sampled hyper-parameters serve as the prior
alpha_m, beta_m = summary_oa.loc['alpha', 'mean'], summary_oa.loc['beta', 'mean']
# Conjugate update: Beta(prior_alpha + successes, prior_beta + failures)
cont_pos = b(true_c + alpha_m, false_c + beta_m)
treat_pos = b(true_t + alpha_m, false_t + beta_m)
fig, ax = plt.subplots(figsize=(10, 5))
x = np.linspace(0, 1, 5000)
for dist, lbl in ((cont_pos, 'control'), (treat_pos, 'treatment')):
    ax.plot(x, dist.pdf(x), label=lbl)
ax.set_xlabel('Overspend probability')
ax.set_ylabel('Density')
ax.set_title('Posterior distribution')
ax.legend()
<matplotlib.legend.Legend at 0x7fcce6c3f160>
# Analytical Gamma posterior for the overspend ratio, all companies.
# Take posterior mean values of the hyper-parameters as the prior.
k_m, theta_m = summary_ra.loc['k','mean'], summary_ra.loc['theta','mean']
ratio_c, ratio_t = cont.overspend_ratio.mean(), treat.overspend_ratio.mean()
# NOTE(review): update uses shape = prior_k + n_overspenders and
# scale = theta / (1 + theta * n * mean_ratio) — looks like a Gamma-Poisson
# conjugate update on the rate, with true_c/true_t borrowed from the
# Bernoulli analysis above. Confirm the derivation.
# Draw posterior distribution
cont_pos = gamma(a=(true_c + k_m), scale=(theta_m / (1 + theta_m * true_c * ratio_c)))
treat_pos = gamma(a=(true_t + k_m), scale=(theta_m / (1 + theta_m * true_t * ratio_t)))
fig, ax = plt.subplots(figsize=(10,5))
x = np.linspace(0,5,5000)
# Evaluate the density at 1/x to show it on the average-ratio axis; the first
# element x[0] = 0 yields inf under numpy float division, where pdf(inf) = 0.
z = [1/i for i in x] # Average overspend ratio
ax.plot(x, cont_pos.pdf(z), label='control')
ax.plot(x, treat_pos.pdf(z), label='treatment')
ax.set_xlabel('Overspend ratio')
ax.set_ylabel('Density')
ax.set_title('Posterior distribution')
ax.legend()
<matplotlib.legend.Legend at 0x7fcd72fb3430>
# Bayesian tests, large-company segment (cl = control-large, tl = treatment-large)
summary_ol, trace_ol = bayesian_ab(cl, tl, 'over_one_percent', 'bernoulli', 'reduce', 'Large companies overspend probability control vs treatment')
------------------------------------------------------- Large companies overspend probability control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 0.23, treatment: 0.0
summary_rl, trace_rl = bayesian_ab(cl, tl, 'overspend_ratio', 'poisson', 'reduce', 'Large companies overspend ratio control vs treatment')
------------------------------------------------------- Large companies overspend ratio control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 0.1, treatment: 0.0
# Fix typo in the printed title: 'compannies' -> 'companies'
summary_bl, trace_bl = bayesian_ab(cl, tl, 'campaign_budget', 'poisson', 'increase', 'Large companies campaign budget control vs treatment')
------------------------------------------------------- Large compannies campaign budget control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 19908.82, treatment: 0.0
# Bayesian tests, medium-company segment (cm/tm)
summary_om, trace_om = bayesian_ab(cm, tm, 'over_one_percent', 'bernoulli', 'reduce', 'Medium companies overspend probability control vs treatment')
------------------------------------------------------- Medium companies overspend probability control vs treatment ------------------------------------------------------- Probability of beating control: 73.75% Expected loss of choosing control: 0.21, treatment: 0.04
summary_rm, trace_rm = bayesian_ab(cm, tm, 'overspend_ratio', 'poisson', 'reduce', 'Medium companies overspend ratio control vs treatment')
------------------------------------------------------- Medium companies overspend ratio control vs treatment ------------------------------------------------------- Probability of beating control: 23.1625% Expected loss of choosing control: 0.02, treatment: 0.11
summary_bm, trace_bm = bayesian_ab(cm, tm, 'campaign_budget', 'poisson', 'increase', 'Medium companies campaign budget control vs treatment')
------------------------------------------------------- Medium companies campaign budget control vs treatment ------------------------------------------------------- Probability of beating control: 0.0% Expected loss of choosing control: 0.0, treatment: 20002.85
# Bayesian tests, small-company segment (cs/ts)
summary_os, trace_os = bayesian_ab(cs, ts, 'over_one_percent', 'bernoulli', 'reduce', 'Small companies overspend probability control vs treatment')
------------------------------------------------------- Small companies overspend probability control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 0.15, treatment: 0.0
summary_rs, trace_rs = bayesian_ab(cs, ts, 'overspend_ratio', 'poisson', 'reduce', 'Small companies overspend ratio control vs treatment')
------------------------------------------------------- Small companies overspend ratio control vs treatment ------------------------------------------------------- Probability of beating control: 100.0% Expected loss of choosing control: 0.07, treatment: 0.0
summary_bs, trace_bs = bayesian_ab(cs, ts, 'campaign_budget', 'poisson', 'increase', 'Small companies campaign budget control vs treatment')
------------------------------------------------------- Small companies campaign budget control vs treatment ------------------------------------------------------- Probability of beating control: 0.0% Expected loss of choosing control: 0.0, treatment: 3393.91